# create_alignment_sets_aaa.PLS
#
# Cared for by Albert Vilella <>
#
# Copyright Albert Vilella
#
# You may distribute this module under the same terms as perl itself

# POD documentation - main docs before the code

=head1 NAME

create_alignment_sets.PLS - open a list of 1-1(-1-1-1-1-1-1-1-1-1-1)
orthologies and the multifasta nucleotide files with the ">locus_id_"
description, and create a directory with each set of fastas for each
entry in the orthologies list. If no orthology list exist, it will do
the same but matching the ids of each species straightaway

=head1 SYNOPSIS

create_alignment_sets.PLS
 -i1 file1.fna
 -i2 file2.fna
 -i3 file3.fna
 [-i4 file4.fna]
 [-i5 file5.fna]
 [-i6 file6.fna]
 [-i7 file7.fna]
 [-i8 file8.fna]
 [-i9 file9.fna]
 [-i10 file10.fna]
 [-i11 file11.fna]
 [-i12 file12.fna]
 [-orth all_orthomcl.out]
 -o orthoset_dir
 -sets 5
 [-funccats1 funccats/pabyss/pabyss_gene_oid_funccat.txt]

=head1 DESCRIPTION

The columns in the orth file and the infiles have to be in the same
order:

ORTHOMCL42(3 genes,3 taxa):      21397376(_NC_003995) 30264936(_NC_003997) 47530432(_NC_007530)

-sets 5   option to indicate which kind of entries are parsed: sets 5
          for a group of 6 genomes will retrieve sets of type
          1-1-1-1-1.

[-altinput] alternative input file for cases when a gene name is a 
            tag in the orth or viceversa (only for 1 file for the moment)

[-funccats1] a comma separated list of (geneids,funccats) for input1
             to further subdivide by functional category

=head1 AUTHOR - Albert Vilella

Email 

Describe contact details here

=head1 CONTRIBUTORS

Additional contributors names and emails here

=cut


# Let the code begin...

use strict;
use Bio::SeqIO;
use Getopt::Long;

my ($input1,$input2,$input3,$input4,$input5,
    $input6,$input7,$input8,$input9,$input10,
    $input11,$input12,
    $genes,$taxa,$altinput,$orth,$outdir,$funccats1,$dmel_ortholog) = undef;
my ($logfile) = undef;
my $merged = undef;

GetOptions(
	   'i1:s' => \$input1,
	   'i2:s' => \$input2,
	   'i3:s' => \$input3,
	   'i4:s' => \$input4,
	   'i5:s' => \$input5,
	   'i6:s' => \$input6,
	   'i7:s' => \$input7,
	   'I8:s' => \$input8,
	   'i9:s' => \$input9,
	   'i10:s' => \$input10,
	   'i11:s' => \$input11,
	   'i12:s' => \$input12,
	   'alt|alti|altinput:s' => \$altinput,
	   'orth:s'  => \$orth,
           'g|genes:s' => \$genes,
           'm|merged' => \$merged,
           't|taxa:s' => \$taxa,
	   'o|outdir:s'=> \$outdir,
	   'f|fun|funccats1:s'=> \$funccats1,
	   'dmel|dmel_ortholog'=> \$dmel_ortholog,
          );

if (!$outdir) {
    $outdir = "./";
}

unless (-d "$outdir") {
    mkdir "$outdir" or die "couldnt create directory: $!";
}

print "Creating orthomlc alignment sets...\n";
my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);
my $logfile_time = 
    sprintf ("%04d%02d%02d_%02d%02d%02d", $year+1900,$mon,$mday,$hour,$min,$sec);

$logfile = Bio::Root::IO->new
    (
     -file=>Bio::Root::IO->catfile
     (
      ">","$outdir","logfile.create_alignment_sets.$logfile_time.txt"
     ),
    );

printf ("Writing log to %s\n", $logfile->file);
$logfile->_print
    (
     "Started $logfile_time\n"
    );

my $in1 = new Bio::SeqIO(-file => $input1,
                         -format => 'fasta');
my $in2 = new Bio::SeqIO(-file => $input2,
                         -format => 'fasta');

my $altin='';
if ($altinput) {
    $altin = new Bio::SeqIO(-file => $altinput,
                            -format => 'fasta');
}
my $in3;
my $in4;
my $in5;
my $in6;
my $in7;
my $in8;
my $in9;
my $in10;
my $in11;
my $in12;
if ($input3) {
    $in3 = new Bio::SeqIO(-file => $input3,
                          -format => 'fasta');
    if ($input4) {
        $in4 = new Bio::SeqIO(-file => $input4,
                              -format => 'fasta');
        if ($input5) {
            $in5 = new Bio::SeqIO(-file => $input5,
                                  -format => 'fasta');
            if ($input6) {
                $in6 = new Bio::SeqIO(-file => $input6,
                                      -format => 'fasta');
                if ($input7) {
                    $in7 = new Bio::SeqIO(-file => $input7,
                                          -format => 'fasta');
                    if ($input8) {
                        $in8 = new Bio::SeqIO(-file => $input8,
                                              -format => 'fasta');
                        if ($input9) {
                            $in9 = new Bio::SeqIO(-file => $input9,
                                                  -format => 'fasta');
                            if ($input10) {
                                $in10 = new Bio::SeqIO(-file => $input10,
                                                       -format => 'fasta');
                                if ($input11) {
                                    $in11 = new Bio::SeqIO(-file => $input11,
                                                           -format => 'fasta');
                                    if ($input12) {
                                        $in12 = new Bio::SeqIO(-file => $input12,
                                                               -format => 'fasta');
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}

my %seqs;
my ($display_id);
print "Loading file1...\n";
my $num_processed = 0;
while ( my $seq = $in1->next_seq ) {
    $logfile->_print("file $input1\n");
    $logfile->_print("no_$num_processed $display_id\n");
    $num_processed++;
    $display_id = $seq->display_id;
    if ($orth) {
        $seqs{$display_id} = $seq;
    } elsif ($dmel_ortholog && !($display_id =~ /FBgn/)) {
        my $dmel_ortholog_display_id = $seq->desc;
        $dmel_ortholog_display_id =~ /(FBgn\S+)/;
        $dmel_ortholog_display_id = $1;
        $seqs{i01}{$dmel_ortholog_display_id} = $seq;
    } else {
        $seqs{i01}{$display_id} = $seq;
    }
}

$num_processed=0;
print "Loading file2...\n";
while ( my $seq = $in2->next_seq ) {
    $logfile->_print("file $input2\n");
    $logfile->_print("no_$num_processed $display_id\n");
    $num_processed++;
    $display_id = $seq->display_id;
    if ($orth) {
        $seqs{$display_id} = $seq;
    } elsif ($dmel_ortholog && !($display_id =~ /FBgn/)) {
        my $dmel_ortholog_display_id = $seq->desc;
        $dmel_ortholog_display_id =~ /(FBgn\S+)/;
        $dmel_ortholog_display_id = $1;
        $seqs{i02}{$dmel_ortholog_display_id} = $seq;
    } else {
        $seqs{i02}{$display_id} = $seq;
    }
}

$num_processed=0;
print "Loading file3...\n";
if ($input3) {
    while ( my $seq = $in3->next_seq ) {
        $logfile->_print("file $input3 \n");
        $logfile->_print("no_$num_processed $display_id\n");
        $num_processed++;
        $display_id = $seq->display_id;
        if ($orth) {
        $seqs{$display_id} = $seq;
    } elsif ($dmel_ortholog && !($display_id =~ /FBgn/)) {
        my $dmel_ortholog_display_id = $seq->desc;
        $dmel_ortholog_display_id =~ /(FBgn\S+)/;
        $dmel_ortholog_display_id = $1;
        $seqs{i03}{$dmel_ortholog_display_id} = $seq;
    } else {
        $seqs{i03}{$display_id} = $seq;
    }
    }
}

$num_processed=0;
if ($altinput) {
    print "Loading altseq...\n";
    while ( my $seq = $altin->next_seq ) {
        $logfile->_print("file $altinput\n");
        $logfile->_print("no_$num_processed $display_id\n");
        $num_processed++;
        $display_id = $seq->display_id;
        if ($orth) {
        $seqs{$display_id} = $seq;
    } else {
        $seqs{alt}{$display_id} = $seq;
    }
    }
}

$num_processed=0;
if ($input4) {
    print "Loading file4...\n";
    while ( my $seq = $in4->next_seq ) {
        $logfile->_print("file $input4 \n");
        $logfile->_print("no_$num_processed $display_id\n");
        $num_processed++;
        $display_id = $seq->display_id;
        if ($orth) {
        $seqs{$display_id} = $seq;
    } elsif ($dmel_ortholog && !($display_id =~ /FBgn/)) {
        my $dmel_ortholog_display_id = $seq->desc;
        $dmel_ortholog_display_id =~ /(FBgn\S+)/;
        $dmel_ortholog_display_id = $1;
        $seqs{i04}{$dmel_ortholog_display_id} = $seq;
    } else {
        $seqs{i04}{$display_id} = $seq;
    }
    }
}

$num_processed=0;
if ($input5) {
    print "Loading file5...\n";
    while ( my $seq = $in5->next_seq ) {
        $logfile->_print("file $input5 \n");
        $logfile->_print("no_$num_processed $display_id\n");
        $num_processed++;
        $display_id = $seq->display_id;
        if ($orth) {
        $seqs{$display_id} = $seq;
    } elsif ($dmel_ortholog && !($display_id =~ /FBgn/)) {
        my $dmel_ortholog_display_id = $seq->desc;
        $dmel_ortholog_display_id =~ /(FBgn\S+)/;
        $dmel_ortholog_display_id = $1;
        $seqs{i05}{$dmel_ortholog_display_id} = $seq;
    } else {
        $seqs{i05}{$display_id} = $seq;
    }
    }
}

$num_processed=0;
if ($input6) {
    print "Loading file6...\n";
    while ( my $seq = $in6->next_seq ) {
        $logfile->_print("file $input6 \n");
        $logfile->_print("no_$num_processed $display_id\n");
        $num_processed++;
        $display_id = $seq->display_id;
        if ($orth) {
        $seqs{$display_id} = $seq;
    } elsif ($dmel_ortholog && !($display_id =~ /FBgn/)) {
        my $dmel_ortholog_display_id = $seq->desc;
        $dmel_ortholog_display_id =~ /(FBgn\S+)/;
        $dmel_ortholog_display_id = $1;
        $seqs{i06}{$dmel_ortholog_display_id} = $seq;
    } else {
        $seqs{i06}{$display_id} = $seq;
    }
    }
}

$num_processed=0;
if ($input7) {
    print "Loading file7...\n";
    while ( my $seq = $in7->next_seq ) {
        $logfile->_print("file $input7 \n");
        $logfile->_print("no_$num_processed $display_id\n");
        $num_processed++;
        $display_id = $seq->display_id;
        if ($orth) {
        $seqs{$display_id} = $seq;
    } elsif ($dmel_ortholog && !($display_id =~ /FBgn/)) {
        my $dmel_ortholog_display_id = $seq->desc;
        $dmel_ortholog_display_id =~ /(FBgn\S+)/;
        $dmel_ortholog_display_id = $1;
        $seqs{i07}{$dmel_ortholog_display_id} = $seq;
    } else {
        $seqs{i07}{$display_id} = $seq;
    }
    }
}

$num_processed=0;
if ($input8) {
    print "Loading file8...\n";
    while ( my $seq = $in8->next_seq ) {
        $logfile->_print("file $input8 \n");
        $logfile->_print("no_$num_processed $display_id\n");
        $num_processed++;
        $display_id = $seq->display_id;
        if ($orth) {
        $seqs{$display_id} = $seq;
    } elsif ($dmel_ortholog && !($display_id =~ /FBgn/)) {
        my $dmel_ortholog_display_id = $seq->desc;
        $dmel_ortholog_display_id =~ /(FBgn\S+)/;
        $dmel_ortholog_display_id = $1;
        $seqs{i08}{$dmel_ortholog_display_id} = $seq;
    } else {
        $seqs{i08}{$display_id} = $seq;
    }
    }
}

$num_processed=0;
if ($input9) {
    print "Loading file9...\n";
    while ( my $seq = $in9->next_seq ) {
        $logfile->_print("file $input9 \n");
        $logfile->_print("no_$num_processed $display_id\n");
        $num_processed++;
        $display_id = $seq->display_id;
        if ($orth) {
        $seqs{$display_id} = $seq;
    } elsif ($dmel_ortholog && !($display_id =~ /FBgn/)) {
        my $dmel_ortholog_display_id = $seq->desc;
        $dmel_ortholog_display_id =~ /(FBgn\S+)/;
        $dmel_ortholog_display_id = $1;
        $seqs{i09}{$dmel_ortholog_display_id} = $seq;
    } else {
        $seqs{i09}{$display_id} = $seq;
    }
    }
}

$num_processed=0;
if ($input10) {
    print "Loading file10...\n";
    while ( my $seq = $in10->next_seq ) {
        $logfile->_print("file $input10 \n");
        $logfile->_print("no_$num_processed $display_id\n");
        $num_processed++;
        $display_id = $seq->display_id;
        if ($orth) {
        $seqs{$display_id} = $seq;
    } elsif ($dmel_ortholog && !($display_id =~ /FBgn/)) {
        my $dmel_ortholog_display_id = $seq->desc;
        $dmel_ortholog_display_id =~ /(FBgn\S+)/;
        $dmel_ortholog_display_id = $1;
        $seqs{i10}{$dmel_ortholog_display_id} = $seq;
    } else {
        $seqs{i10}{$display_id} = $seq;
    }
    }
}

$num_processed=0;
if ($input11) {
    print "Loading file11...\n";
    while ( my $seq = $in11->next_seq ) {
        $logfile->_print("file $input11\n");
        $logfile->_print("no_$num_processed $display_id\n");
        $num_processed++;
        $display_id = $seq->display_id;
        if ($orth) {
        $seqs{$display_id} = $seq;
    } elsif ($dmel_ortholog && !($display_id =~ /FBgn/)) {
        my $dmel_ortholog_display_id = $seq->desc;
        $dmel_ortholog_display_id =~ /(FBgn\S+)/;
        $dmel_ortholog_display_id = $1;
        $seqs{i11}{$dmel_ortholog_display_id} = $seq;
    } else {
        $seqs{i11}{$display_id} = $seq;
    }
    }
}

$num_processed=0;
if ($input12) {
    print "Loading file12...\n";
    while ( my $seq = $in12->next_seq ) {
        $logfile->_print("file $input12\n");
        $logfile->_print("no_$num_processed $display_id\n");
        $num_processed++;
        $display_id = $seq->display_id;
        if ($orth) {
        $seqs{$display_id} = $seq;
    } elsif ($dmel_ortholog && !($display_id =~ /FBgn/)) {
        my $dmel_ortholog_display_id = $seq->desc;
        $dmel_ortholog_display_id =~ /(FBgn\S+)/;
        $dmel_ortholog_display_id = $1;
        $seqs{i12}{$dmel_ortholog_display_id} = $seq;
    } else {
        $seqs{i12}{$display_id} = $seq;
    }
    }
}

$num_processed=0;

my %funccats1 = undef;
my $count = 0;
if ($funccats1) {
    open (FUNCCATS1,"$funccats1") or die "cannot open orth file: $!";
    print "Loading funccats1...\n";
    while (<FUNCCATS1>) {
        if ($_ =~ /(\S+)\,(\S+)/g) {
            $funccats1{$1} = $2;
            $count++;
        } else {
            warn "error parsing funccats1: $_\n";
        }
    }
}
print "Loaded $count functional category entries...\n";

$count = 0;
my $orthology;
my %hash_of_orthologies;
my $out;
my $outseq;
my $geneid = undef;
print "Creating sets...\n";

if ($orth) {
    my $regexp;
    my $pattern;
    $pattern = 'ORTHOMCL(\S+)\(' . "$genes" . '\ genes\,' . "$taxa" . '\ taxa\)\:';

    $count;
    for ($count=1;$count<=$genes;$count++) {
        $pattern .= '\s+(\S+)\((\S+)\)';
    }

    $regexp = qr/^${pattern}/i;

    open (ORTH,"$orth") or die "cannot open orth file: $!";
    print "Loading orthologies...\n";
    while (<ORTH>) {
        if ($_ =~ /$regexp/) {
            $orthology = sprintf("%05d", $1);
            $hash_of_orthologies{$orthology}{$3}  = $2;
            $hash_of_orthologies{$orthology}{$5}  = $4;
            eval {$hash_of_orthologies{$orthology}{$7}  =  $6;};
            eval {$hash_of_orthologies{$orthology}{$9}  =  $8;};
            eval {$hash_of_orthologies{$orthology}{$11} = $10;};
            eval {$hash_of_orthologies{$orthology}{$13} = $12;};
            eval {$hash_of_orthologies{$orthology}{$15} = $14;};
            eval {$hash_of_orthologies{$orthology}{$17} = $16;};
            eval {$hash_of_orthologies{$orthology}{$19} = $18;};
            eval {$hash_of_orthologies{$orthology}{$21} = $20;};
            eval {$hash_of_orthologies{$orthology}{$23} = $22;};
            eval {$hash_of_orthologies{$orthology}{$25} = $24;};
            $orthology = "";
            $count++;
        }
    }

    close ORTH;
    print "Loaded $count orthologous entries...\n";
    $count = 0;

    foreach my $orthonumber (sort keys %hash_of_orthologies) {
        my $spnum = 1; my $firstgene = "";
        foreach my $species (sort keys %{ $hash_of_orthologies{$orthonumber} }) {
            $geneid = $hash_of_orthologies{$orthonumber}{$species};
            if (length($geneid) == 0) {
                next;
            }                   # get rid of null evals
            if (1 == $spnum) {
                $firstgene = $geneid;
                my $subdir = undef;
                if ($funccats1) {
                    my $cat = $funccats1{$firstgene} || "w";
                    unless (-d "$outdir/$cat") {
                        mkdir ("$outdir/$cat");
                    }
                    $subdir = "$outdir/$cat/$firstgene";
                } else {
                    $subdir = "$outdir/$firstgene";
                }
                unless (-d "$subdir") {
                    mkdir ("$subdir");
                }
                $logfile->_print
                    (
                     "$subdir\n"
                    );
                $out = new Bio::SeqIO(-file => ">$subdir/$geneid.fasta");
            }
            if (($seqs{$geneid})) {
                $outseq = $seqs{$geneid};
                my $species_mark_primary_id = "sp" . sprintf("%02d", $spnum);
                $species_mark_primary_id .= "_" . $species . "_" . $outseq->primary_id;
                $outseq->display_id("$species_mark_primary_id");
                $out->write_seq($outseq);
            } else {
                warn "have not found $geneid in $species for orthology $orthonumber\n";
            }
            $spnum++;
        }
    }
} else {
    my %geneid_occurrences;
    foreach my $species (sort keys %seqs) {
        foreach my $geneid (sort keys %{ $seqs{$species} }) {
            $geneid_occurrences{$geneid} += 1;
        }
    }
    foreach my $geneid (sort keys %geneid_occurrences) {
        if ($taxa == $geneid_occurrences{$geneid}) {
            my $spnum = 1; my $firstgene = "";
            foreach my $species (sort keys %seqs) {
                if (1 == $spnum) {
                    $firstgene = $geneid;
                    my $subdir = undef;
                    if ($funccats1) {
                        my $cat = $funccats1{$firstgene} || "w";
                        unless (-d "$outdir/$cat") {
                            mkdir ("$outdir/$cat");
                        }
                        $subdir = "$outdir/$cat/$firstgene";
                    } else {
                        $subdir = "$outdir/$firstgene";
                    }
                    unless (-d "$subdir") {
                        mkdir ("$subdir");
                    }
                    $logfile->_print
                        (
                         "$subdir\n"
                        );
                    $out = new Bio::SeqIO(-file => ">$subdir/$geneid.fasta");
                }
                $outseq = $seqs{$species}{$geneid};
                    my $species_mark_primary_id = "sp" . sprintf("%02d", $spnum);
                $species_mark_primary_id .= "_" . $species . "_" . $outseq->primary_id;
                $outseq->display_id("$species_mark_primary_id");
                $out->write_seq($outseq);
                $spnum++;
            }
        }
    }

}

$logfile->_print("End\n");
$logfile->close();

print "\n";
print "Done.\n";

1;
